doctra 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/engines/vlm/service.py +0 -12
- doctra/parsers/structured_pdf_parser.py +11 -60
- doctra/parsers/table_chart_extractor.py +8 -44
- doctra/ui/app.py +5 -32
- doctra/utils/progress.py +13 -69
- doctra/utils/structured_utils.py +45 -49
- doctra/version.py +1 -1
- {doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/METADATA +1 -1
- {doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/RECORD +12 -12
- {doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/WHEEL +0 -0
- {doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.3.1.dist-info → doctra-0.3.3.dist-info}/top_level.txt +0 -0
doctra/engines/vlm/service.py
CHANGED
@@ -19,7 +19,6 @@ class VLMStructuredExtractor:
|
|
19
19
|
chart = vlm.extract_chart("/abs/path/chart.jpg")
|
20
20
|
table = vlm.extract_table("/abs/path/table.jpg")
|
21
21
|
|
22
|
-
# Or with Anthropic:
|
23
22
|
vlm = VLMStructuredExtractor(vlm_provider="anthropic", api_key="YOUR_KEY")
|
24
23
|
"""
|
25
24
|
|
@@ -32,8 +31,6 @@ class VLMStructuredExtractor:
|
|
32
31
|
):
|
33
32
|
"""
|
34
33
|
Initialize the VLMStructuredExtractor with provider configuration.
|
35
|
-
|
36
|
-
Sets up the VLM model for structured data extraction from images.
|
37
34
|
|
38
35
|
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
39
36
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
@@ -60,8 +57,6 @@ class VLMStructuredExtractor:
|
|
60
57
|
:raises Exception: If image processing or VLM call fails
|
61
58
|
"""
|
62
59
|
try:
|
63
|
-
# Normalize path and verify readability
|
64
|
-
# (get_image_from_local already absolutizes & raises if missing)
|
65
60
|
img = get_image_from_local(image_path)
|
66
61
|
if img.mode != "RGB":
|
67
62
|
img = img.convert("RGB")
|
@@ -71,15 +66,11 @@ class VLMStructuredExtractor:
|
|
71
66
|
|
72
67
|
return result
|
73
68
|
except Exception as e:
|
74
|
-
# Re-raise so caller can handle/log too
|
75
69
|
raise
|
76
70
|
|
77
71
|
def extract_chart(self, image_path: str) -> Chart:
|
78
72
|
"""
|
79
73
|
Extract structured chart data from an image.
|
80
|
-
|
81
|
-
Uses VLM to analyze a chart image and extract the data in a structured
|
82
|
-
format with title, headers, and rows.
|
83
74
|
|
84
75
|
:param image_path: Path to the chart image file
|
85
76
|
:return: Chart object containing extracted title, headers, and data rows
|
@@ -96,9 +87,6 @@ class VLMStructuredExtractor:
|
|
96
87
|
def extract_table(self, image_path: str) -> Table:
|
97
88
|
"""
|
98
89
|
Extract structured table data from an image.
|
99
|
-
|
100
|
-
Uses VLM to analyze a table image and extract the data in a structured
|
101
|
-
format with title, headers, and rows.
|
102
90
|
|
103
91
|
:param image_path: Path to the table image file
|
104
92
|
:return: Table object containing extracted title, headers, and data rows
|
doctra/parsers/structured_pdf_parser.py
CHANGED
@@ -64,22 +64,19 @@ class StructuredPDFParser:
|
|
64
64
|
):
|
65
65
|
"""
|
66
66
|
Initialize the StructuredPDFParser with processing configuration.
|
67
|
-
|
68
|
-
Sets up the layout detection engine, OCR engine, and optionally
|
69
|
-
the VLM service for comprehensive document processing.
|
70
67
|
|
71
|
-
:param use_vlm: Whether to use VLM for structured data extraction
|
72
|
-
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
|
68
|
+
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
69
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
73
70
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
74
|
-
:param vlm_api_key: API key for VLM provider
|
75
|
-
:param layout_model_name: Layout detection model name
|
76
|
-
:param dpi: DPI for PDF rendering
|
77
|
-
:param min_score: Minimum confidence score for layout detection
|
78
|
-
:param ocr_lang: OCR language code
|
79
|
-
:param ocr_psm: Tesseract page segmentation mode
|
80
|
-
:param ocr_oem: Tesseract OCR engine mode
|
81
|
-
:param ocr_extra_config: Additional Tesseract configuration
|
82
|
-
:param box_separator: Separator between text boxes in output
|
71
|
+
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
72
|
+
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
73
|
+
:param dpi: DPI for PDF rendering (default: 200)
|
74
|
+
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
75
|
+
:param ocr_lang: OCR language code (default: "eng")
|
76
|
+
:param ocr_psm: Tesseract page segmentation mode (default: 4)
|
77
|
+
:param ocr_oem: Tesseract OCR engine mode (default: 3)
|
78
|
+
:param ocr_extra_config: Additional Tesseract configuration (default: "")
|
79
|
+
:param box_separator: Separator between text boxes in output (default: "\n")
|
83
80
|
"""
|
84
81
|
self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
|
85
82
|
self.dpi = dpi
|
@@ -100,15 +97,10 @@ class StructuredPDFParser:
|
|
100
97
|
def parse(self, pdf_path: str) -> None:
|
101
98
|
"""
|
102
99
|
Parse a PDF document and extract all content types.
|
103
|
-
|
104
|
-
Processes the PDF through layout detection, extracts text using OCR,
|
105
|
-
saves images for visual elements, and optionally converts charts/tables
|
106
|
-
to structured data using VLM.
|
107
100
|
|
108
101
|
:param pdf_path: Path to the input PDF file
|
109
102
|
:return: None
|
110
103
|
"""
|
111
|
-
# Extract filename without extension and create output directory
|
112
104
|
pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
113
105
|
out_dir = f"outputs/{pdf_filename}/full_parse"
|
114
106
|
|
@@ -120,7 +112,6 @@ class StructuredPDFParser:
|
|
120
112
|
)
|
121
113
|
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
122
114
|
|
123
|
-
# Count for progress bars
|
124
115
|
fig_count = sum(sum(1 for b in p.boxes if b.label == "figure") for p in pages)
|
125
116
|
chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages)
|
126
117
|
table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages)
|
@@ -133,11 +124,8 @@ class StructuredPDFParser:
|
|
133
124
|
figures_desc = "Figures (cropped)"
|
134
125
|
|
135
126
|
with ExitStack() as stack:
|
136
|
-
# Enhanced environment detection
|
137
127
|
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
138
128
|
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
139
|
-
|
140
|
-
# Use appropriate progress bars based on environment
|
141
129
|
if is_notebook:
|
142
130
|
charts_bar = stack.enter_context(
|
143
131
|
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
@@ -165,13 +153,11 @@ class StructuredPDFParser:
|
|
165
153
|
rel = os.path.relpath(abs_img_path, out_dir)
|
166
154
|
|
167
155
|
if box.label == "figure":
|
168
|
-
# Figures are always images in MD
|
169
156
|
md_lines.append(f"\n")
|
170
157
|
if figures_bar: figures_bar.update(1)
|
171
158
|
|
172
159
|
elif box.label == "chart":
|
173
160
|
if self.use_vlm and self.vlm:
|
174
|
-
# Try structured → Markdown table; fallback to image if it fails
|
175
161
|
wrote_table = False
|
176
162
|
try:
|
177
163
|
chart = self.vlm.extract_chart(abs_img_path)
|
@@ -193,7 +179,6 @@ class StructuredPDFParser:
|
|
193
179
|
|
194
180
|
elif box.label == "table":
|
195
181
|
if self.use_vlm and self.vlm:
|
196
|
-
# Try structured → Markdown table; fallback to image if it fails
|
197
182
|
wrote_table = False
|
198
183
|
try:
|
199
184
|
table = self.vlm.extract_table(abs_img_path)
|
@@ -229,7 +214,6 @@ class StructuredPDFParser:
|
|
229
214
|
html_structured_path = os.path.join(out_dir, "tables.html")
|
230
215
|
write_structured_html(html_structured_path, structured_items)
|
231
216
|
|
232
|
-
# Print completion message with output directory
|
233
217
|
print(f"✅ Parsing completed successfully!")
|
234
218
|
print(f"📁 Output directory: {out_dir}")
|
235
219
|
|
@@ -249,30 +233,25 @@ class StructuredPDFParser:
|
|
249
233
|
:param save_path: Optional path to save the visualization (if None, displays only)
|
250
234
|
:return: None
|
251
235
|
"""
|
252
|
-
# Get layout predictions
|
253
236
|
pages: List[LayoutPage] = self.layout_engine.predict_pdf(
|
254
237
|
pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
|
255
238
|
)
|
256
239
|
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
257
240
|
|
258
|
-
# Limit to requested number of pages
|
259
241
|
pages_to_show = min(num_pages, len(pages))
|
260
242
|
|
261
243
|
if pages_to_show == 0:
|
262
244
|
print("No pages to display")
|
263
245
|
return
|
264
246
|
|
265
|
-
# Calculate grid dimensions
|
266
247
|
rows = (pages_to_show + cols - 1) // cols
|
267
248
|
|
268
|
-
# Collect unique labels from the processed pages and assign colors
|
269
249
|
used_labels = set()
|
270
250
|
for idx in range(pages_to_show):
|
271
251
|
page = pages[idx]
|
272
252
|
for box in page.boxes:
|
273
253
|
used_labels.add(box.label.lower())
|
274
254
|
|
275
|
-
# Create dynamic color assignment for all detected labels
|
276
255
|
base_colors = ['#3B82F6', '#EF4444', '#10B981', '#F59E0B', '#8B5CF6',
|
277
256
|
'#F97316', '#EC4899', '#6B7280', '#84CC16', '#06B6D4',
|
278
257
|
'#DC2626', '#059669', '#7C3AED', '#DB2777', '#0891B2']
|
@@ -281,22 +260,18 @@ class StructuredPDFParser:
|
|
281
260
|
for i, label in enumerate(sorted(used_labels)):
|
282
261
|
dynamic_label_colors[label] = base_colors[i % len(base_colors)]
|
283
262
|
|
284
|
-
# Process each page and add bounding boxes
|
285
263
|
processed_pages = []
|
286
264
|
|
287
265
|
for idx in range(pages_to_show):
|
288
266
|
page = pages[idx]
|
289
267
|
page_img = pil_pages[idx].copy()
|
290
268
|
|
291
|
-
# Calculate scale factor to resize to target width
|
292
269
|
scale_factor = page_width / page_img.width
|
293
270
|
new_height = int(page_img.height * scale_factor)
|
294
271
|
page_img = page_img.resize((page_width, new_height), Image.LANCZOS)
|
295
272
|
|
296
|
-
# Create drawing context
|
297
273
|
draw = ImageDraw.Draw(page_img)
|
298
274
|
|
299
|
-
# Try to load a nice font, fallback to default
|
300
275
|
try:
|
301
276
|
font = ImageFont.truetype("arial.ttf", 24)
|
302
277
|
small_font = ImageFont.truetype("arial.ttf", 18)
|
@@ -308,21 +283,16 @@ class StructuredPDFParser:
|
|
308
283
|
font = None
|
309
284
|
small_font = None
|
310
285
|
|
311
|
-
# Draw bounding boxes
|
312
286
|
for box in page.boxes:
|
313
|
-
# Scale coordinates
|
314
287
|
x1 = int(box.x1 * scale_factor)
|
315
288
|
y1 = int(box.y1 * scale_factor)
|
316
289
|
x2 = int(box.x2 * scale_factor)
|
317
290
|
y2 = int(box.y2 * scale_factor)
|
318
291
|
|
319
|
-
# Get color for this label from dynamic assignment
|
320
292
|
color = dynamic_label_colors.get(box.label.lower(), '#000000')
|
321
293
|
|
322
|
-
# Draw rectangle with rounded corners effect
|
323
294
|
draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
|
324
295
|
|
325
|
-
# Draw label background
|
326
296
|
label_text = f"{box.label} ({box.score:.2f})"
|
327
297
|
if font:
|
328
298
|
bbox = draw.textbbox((0, 0), label_text, font=small_font)
|
@@ -332,11 +302,9 @@ class StructuredPDFParser:
|
|
332
302
|
text_width = len(label_text) * 8
|
333
303
|
text_height = 15
|
334
304
|
|
335
|
-
# Position label above the box
|
336
305
|
label_x = x1
|
337
306
|
label_y = max(0, y1 - text_height - 8)
|
338
307
|
|
339
|
-
# Draw label background with padding
|
340
308
|
padding = 4
|
341
309
|
draw.rectangle([
|
342
310
|
label_x - padding,
|
@@ -345,10 +313,8 @@ class StructuredPDFParser:
|
|
345
313
|
label_y + text_height + padding
|
346
314
|
], fill='white', outline=color, width=2)
|
347
315
|
|
348
|
-
# Draw label text
|
349
316
|
draw.text((label_x, label_y), label_text, fill=color, font=small_font)
|
350
317
|
|
351
|
-
# Add page title
|
352
318
|
title_text = f"Page {page.page_index} ({len(page.boxes)} boxes)"
|
353
319
|
if font:
|
354
320
|
title_bbox = draw.textbbox((0, 0), title_text, font=font)
|
@@ -356,7 +322,6 @@ class StructuredPDFParser:
|
|
356
322
|
else:
|
357
323
|
title_width = len(title_text) * 12
|
358
324
|
|
359
|
-
# Draw title background
|
360
325
|
title_x = (page_width - title_width) // 2
|
361
326
|
title_y = 10
|
362
327
|
draw.rectangle([title_x - 10, title_y - 5, title_x + title_width + 10, title_y + 35],
|
@@ -365,16 +330,13 @@ class StructuredPDFParser:
|
|
365
330
|
|
366
331
|
processed_pages.append(page_img)
|
367
332
|
|
368
|
-
# Create grid layout with space for legend
|
369
333
|
legend_width = 250
|
370
334
|
grid_width = cols * page_width + (cols - 1) * spacing
|
371
335
|
total_width = grid_width + legend_width + spacing
|
372
336
|
grid_height = rows * (processed_pages[0].height if processed_pages else 600) + (rows - 1) * spacing
|
373
337
|
|
374
|
-
# Create final grid image with modern background
|
375
338
|
final_img = Image.new('RGB', (total_width, grid_height), '#F8FAFC')
|
376
339
|
|
377
|
-
# Place pages in grid
|
378
340
|
for idx, page_img in enumerate(processed_pages):
|
379
341
|
row = idx // cols
|
380
342
|
col = idx % cols
|
@@ -384,13 +346,11 @@ class StructuredPDFParser:
|
|
384
346
|
|
385
347
|
final_img.paste(page_img, (x_pos, y_pos))
|
386
348
|
|
387
|
-
# Create legend
|
388
349
|
legend_x = grid_width + spacing
|
389
350
|
legend_y = 20
|
390
351
|
|
391
352
|
draw_legend = ImageDraw.Draw(final_img)
|
392
353
|
|
393
|
-
# Legend title
|
394
354
|
legend_title = "Element Types"
|
395
355
|
if font:
|
396
356
|
title_bbox = draw_legend.textbbox((0, 0), legend_title, font=font)
|
@@ -400,47 +360,38 @@ class StructuredPDFParser:
|
|
400
360
|
title_width = len(legend_title) * 12
|
401
361
|
title_height = 20
|
402
362
|
|
403
|
-
# Draw legend background
|
404
363
|
legend_bg_height = len(used_labels) * 35 + title_height + 40
|
405
364
|
draw_legend.rectangle([legend_x - 10, legend_y - 10,
|
406
365
|
legend_x + legend_width - 10, legend_y + legend_bg_height],
|
407
366
|
fill='white', outline='#E5E7EB', width=2)
|
408
367
|
|
409
|
-
# Draw legend title
|
410
368
|
draw_legend.text((legend_x + 10, legend_y + 5), legend_title,
|
411
369
|
fill='#1F2937', font=font)
|
412
370
|
|
413
|
-
# Draw legend items - now using dynamic colors for actually detected labels
|
414
371
|
current_y = legend_y + title_height + 20
|
415
372
|
|
416
373
|
for label in sorted(used_labels):
|
417
374
|
color = dynamic_label_colors[label]
|
418
375
|
|
419
|
-
# Draw color square
|
420
376
|
square_size = 20
|
421
377
|
draw_legend.rectangle([legend_x + 10, current_y,
|
422
378
|
legend_x + 10 + square_size, current_y + square_size],
|
423
379
|
fill=color, outline='#6B7280', width=1)
|
424
380
|
|
425
|
-
# Draw label text
|
426
381
|
draw_legend.text((legend_x + 40, current_y + 2), label.title(),
|
427
382
|
fill='#374151', font=small_font)
|
428
383
|
|
429
384
|
current_y += 30
|
430
385
|
|
431
|
-
# Save or display
|
432
386
|
if save_path:
|
433
387
|
final_img.save(save_path, quality=95, optimize=True)
|
434
388
|
print(f"Layout visualization saved to: {save_path}")
|
435
389
|
else:
|
436
|
-
# Display using PIL's default viewer
|
437
390
|
final_img.show()
|
438
391
|
|
439
|
-
# Print summary statistics
|
440
392
|
print(f"\n📊 Layout Detection Summary for {os.path.basename(pdf_path)}:")
|
441
393
|
print(f"Pages processed: {pages_to_show}")
|
442
394
|
|
443
|
-
# Create summary by label across all pages
|
444
395
|
total_counts = {}
|
445
396
|
for idx in range(pages_to_show):
|
446
397
|
page = pages[idx]
|
doctra/parsers/table_chart_extractor.py
CHANGED
@@ -61,22 +61,17 @@ class ChartTablePDFParser:
|
|
61
61
|
):
|
62
62
|
"""
|
63
63
|
Initialize the ChartTablePDFParser with extraction configuration.
|
64
|
-
|
65
|
-
Sets up the layout detection engine and optionally the VLM service
|
66
|
-
for structured data extraction.
|
67
64
|
|
68
|
-
:param extract_charts: Whether to extract charts from the document
|
69
|
-
:param extract_tables: Whether to extract tables from the document
|
70
|
-
:param use_vlm: Whether to use VLM for structured data extraction
|
71
|
-
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter")
|
65
|
+
:param extract_charts: Whether to extract charts from the document (default: True)
|
66
|
+
:param extract_tables: Whether to extract tables from the document (default: True)
|
67
|
+
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
68
|
+
:param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
|
72
69
|
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
73
|
-
:param vlm_api_key: API key for VLM provider
|
74
|
-
:param layout_model_name: Layout detection model name
|
75
|
-
:param dpi: DPI for PDF rendering
|
76
|
-
:param min_score: Minimum confidence score for layout detection
|
77
|
-
:raises ValueError: If neither extract_charts nor extract_tables is True
|
70
|
+
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
71
|
+
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
72
|
+
:param dpi: DPI for PDF rendering (default: 200)
|
73
|
+
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
78
74
|
"""
|
79
|
-
# Validation
|
80
75
|
if not extract_charts and not extract_tables:
|
81
76
|
raise ValueError("At least one of extract_charts or extract_tables must be True")
|
82
77
|
|
@@ -98,21 +93,15 @@ class ChartTablePDFParser:
|
|
98
93
|
def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
|
99
94
|
"""
|
100
95
|
Parse a PDF document and extract charts and/or tables.
|
101
|
-
|
102
|
-
Processes the PDF through layout detection, extracts the specified
|
103
|
-
element types, saves cropped images, and optionally converts them
|
104
|
-
to structured data using VLM.
|
105
96
|
|
106
97
|
:param pdf_path: Path to the input PDF file
|
107
98
|
:param output_base_dir: Base directory for output files (default: "outputs")
|
108
99
|
:return: None
|
109
100
|
"""
|
110
|
-
# Create output directory structure: outputs/<filename>/structured_parsing/
|
111
101
|
pdf_name = Path(pdf_path).stem
|
112
102
|
out_dir = os.path.join(output_base_dir, pdf_name, "structured_parsing")
|
113
103
|
os.makedirs(out_dir, exist_ok=True)
|
114
104
|
|
115
|
-
# Create subdirectories based on what we're extracting
|
116
105
|
charts_dir = None
|
117
106
|
tables_dir = None
|
118
107
|
|
@@ -129,24 +118,20 @@ class ChartTablePDFParser:
|
|
129
118
|
)
|
130
119
|
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
131
120
|
|
132
|
-
# Determine which labels to extract
|
133
121
|
target_labels = []
|
134
122
|
if self.extract_charts:
|
135
123
|
target_labels.append("chart")
|
136
124
|
if self.extract_tables:
|
137
125
|
target_labels.append("table")
|
138
126
|
|
139
|
-
# Count items for progress bars
|
140
127
|
chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages) if self.extract_charts else 0
|
141
128
|
table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages) if self.extract_tables else 0
|
142
129
|
|
143
|
-
# Prepare output content
|
144
130
|
if self.use_vlm:
|
145
131
|
md_lines: List[str] = ["# Extracted Charts and Tables\n"]
|
146
132
|
structured_items: List[Dict[str, Any]] = []
|
147
133
|
vlm_items: List[Dict[str, Any]] = []
|
148
134
|
|
149
|
-
# Progress bar descriptions
|
150
135
|
charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
|
151
136
|
tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
|
152
137
|
|
@@ -154,11 +139,9 @@ class ChartTablePDFParser:
|
|
154
139
|
table_counter = 1
|
155
140
|
|
156
141
|
with ExitStack() as stack:
|
157
|
-
# Enhanced environment detection
|
158
142
|
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
159
143
|
is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
|
160
144
|
|
161
|
-
# Use appropriate progress bars based on environment
|
162
145
|
if is_notebook:
|
163
146
|
charts_bar = stack.enter_context(
|
164
147
|
create_notebook_friendly_bar(total=chart_count, desc=charts_desc)) if chart_count else None
|
@@ -174,23 +157,19 @@ class ChartTablePDFParser:
|
|
174
157
|
page_num = p.page_index
|
175
158
|
page_img: Image.Image = pil_pages[page_num - 1]
|
176
159
|
|
177
|
-
# Only process selected item types
|
178
160
|
target_items = [box for box in p.boxes if box.label in target_labels]
|
179
161
|
|
180
162
|
if target_items and self.use_vlm:
|
181
163
|
md_lines.append(f"\n## Page {page_num}\n")
|
182
164
|
|
183
165
|
for box in sorted(target_items, key=reading_order_key):
|
184
|
-
# Handle charts
|
185
166
|
if box.label == "chart" and self.extract_charts:
|
186
167
|
chart_filename = f"chart_{chart_counter:03d}.png"
|
187
168
|
chart_path = os.path.join(charts_dir, chart_filename)
|
188
169
|
|
189
|
-
# Save image
|
190
170
|
cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
|
191
171
|
cropped_img.save(chart_path)
|
192
172
|
|
193
|
-
# Handle VLM processing if enabled
|
194
173
|
if self.use_vlm and self.vlm:
|
195
174
|
rel_path = os.path.join("charts", chart_filename)
|
196
175
|
wrote_table = False
|
@@ -227,16 +206,13 @@ class ChartTablePDFParser:
|
|
227
206
|
if charts_bar:
|
228
207
|
charts_bar.update(1)
|
229
208
|
|
230
|
-
# Handle tables
|
231
209
|
elif box.label == "table" and self.extract_tables:
|
232
210
|
table_filename = f"table_{table_counter:03d}.png"
|
233
211
|
table_path = os.path.join(tables_dir, table_filename)
|
234
212
|
|
235
|
-
# Save image
|
236
213
|
cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
|
237
214
|
cropped_img.save(table_path)
|
238
215
|
|
239
|
-
# Handle VLM processing if enabled
|
240
216
|
if self.use_vlm and self.vlm:
|
241
217
|
rel_path = os.path.join("tables", table_filename)
|
242
218
|
wrote_table = False
|
@@ -273,19 +249,11 @@ class ChartTablePDFParser:
|
|
273
249
|
if tables_bar:
|
274
250
|
tables_bar.update(1)
|
275
251
|
|
276
|
-
# Write outputs only if VLM is used
|
277
|
-
md_path = None
|
278
252
|
excel_path = None
|
279
253
|
|
280
254
|
if self.use_vlm:
|
281
|
-
# Write markdown file
|
282
|
-
md_path = os.path.join(out_dir, "charts.md")
|
283
|
-
with open(md_path, 'w', encoding='utf-8') as f:
|
284
|
-
f.write('\n'.join(md_lines))
|
285
255
|
|
286
|
-
# Write Excel file if we have structured data
|
287
256
|
if structured_items:
|
288
|
-
# Determine Excel filename based on extraction target
|
289
257
|
if self.extract_charts and self.extract_tables:
|
290
258
|
excel_filename = "parsed_tables_charts.xlsx"
|
291
259
|
elif self.extract_charts:
|
@@ -299,23 +267,19 @@ class ChartTablePDFParser:
|
|
299
267
|
excel_path = os.path.join(out_dir, excel_filename)
|
300
268
|
write_structured_excel(excel_path, structured_items)
|
301
269
|
|
302
|
-
# Also create HTML version
|
303
270
|
html_filename = excel_filename.replace('.xlsx', '.html')
|
304
271
|
html_path = os.path.join(out_dir, html_filename)
|
305
272
|
write_structured_html(html_path, structured_items)
|
306
273
|
|
307
|
-
# Write VLM items mapping for UI linkage
|
308
274
|
if 'vlm_items' in locals() and vlm_items:
|
309
275
|
with open(os.path.join(out_dir, "vlm_items.json"), 'w', encoding='utf-8') as jf:
|
310
276
|
json.dump(vlm_items, jf, ensure_ascii=False, indent=2)
|
311
277
|
|
312
|
-
# Print results
|
313
278
|
extraction_types = []
|
314
279
|
if self.extract_charts:
|
315
280
|
extraction_types.append("charts")
|
316
281
|
if self.extract_tables:
|
317
282
|
extraction_types.append("tables")
|
318
283
|
|
319
|
-
# Print completion message with output directory
|
320
284
|
print(f"✅ Parsing completed successfully!")
|
321
285
|
print(f"📁 Output directory: {out_dir}")
|
doctra/ui/app.py
CHANGED
@@ -17,13 +17,10 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
|
|
17
17
|
|
18
18
|
if out_dir.exists():
|
19
19
|
if is_structured_parsing:
|
20
|
-
# For structured parsing, show ALL files in the directory
|
21
20
|
for file_path in sorted(out_dir.rglob("*")):
|
22
21
|
if file_path.is_file():
|
23
22
|
file_paths.append(str(file_path))
|
24
23
|
else:
|
25
|
-
# For full parsing, use the original logic
|
26
|
-
# Always add main output files (HTML, Markdown, etc.) regardless of allowed_kinds
|
27
24
|
main_files = [
|
28
25
|
"result.html",
|
29
26
|
"result.md",
|
@@ -36,22 +33,18 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
|
|
36
33
|
if file_path.exists():
|
37
34
|
file_paths.append(str(file_path))
|
38
35
|
|
39
|
-
# Add image files based on allowed_kinds or all images if not specified
|
40
36
|
if allowed_kinds:
|
41
37
|
for kind in allowed_kinds:
|
42
|
-
# ChartTablePDFParser saves directly to charts/ and tables/ directories
|
43
38
|
p = out_dir / kind
|
44
39
|
if p.exists():
|
45
|
-
for img in sorted(p.glob("*.png")):
|
40
|
+
for img in sorted(p.glob("*.png")):
|
46
41
|
file_paths.append(str(img))
|
47
42
|
|
48
|
-
# Also check images/ subdirectories (for StructuredPDFParser)
|
49
43
|
images_dir = out_dir / "images" / kind
|
50
44
|
if images_dir.exists():
|
51
|
-
for img in sorted(images_dir.glob("*.jpg")):
|
45
|
+
for img in sorted(images_dir.glob("*.jpg")):
|
52
46
|
file_paths.append(str(img))
|
53
47
|
else:
|
54
|
-
# Fallback: look in both direct directories and images/ subdirectories
|
55
48
|
for p in (out_dir / "charts").glob("*.png"):
|
56
49
|
file_paths.append(str(p))
|
57
50
|
for p in (out_dir / "tables").glob("*.png"):
|
@@ -59,7 +52,6 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
|
|
59
52
|
for p in (out_dir / "images").rglob("*.jpg"):
|
60
53
|
file_paths.append(str(p))
|
61
54
|
|
62
|
-
# Add Excel files based on extraction target (for structured parsing)
|
63
55
|
if allowed_kinds:
|
64
56
|
if "charts" in allowed_kinds and "tables" in allowed_kinds:
|
65
57
|
excel_files = ["parsed_tables_charts.xlsx"]
|
@@ -77,30 +69,24 @@ def _gather_outputs(out_dir: Path, allowed_kinds: Optional[List[str]] = None, zi
|
|
77
69
|
|
78
70
|
kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]
|
79
71
|
for sub in kinds:
|
80
|
-
# Look in both direct directories and images/ subdirectories
|
81
|
-
# First try direct directories (for ChartTablePDFParser)
|
82
72
|
p = out_dir / sub
|
83
73
|
if p.exists():
|
84
|
-
for img in sorted(p.glob("*.png")):
|
74
|
+
for img in sorted(p.glob("*.png")):
|
85
75
|
gallery_items.append((str(img), f"{sub}: {img.name}"))
|
86
76
|
|
87
|
-
# Also try images/ subdirectories (for StructuredPDFParser)
|
88
77
|
images_dir = out_dir / "images" / sub
|
89
78
|
if images_dir.exists():
|
90
|
-
for img in sorted(images_dir.glob("*.jpg")):
|
79
|
+
for img in sorted(images_dir.glob("*.jpg")):
|
91
80
|
gallery_items.append((str(img), f"{sub}: {img.name}"))
|
92
81
|
|
93
82
|
tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))
|
94
83
|
|
95
|
-
# Use custom filename if provided, otherwise use default
|
96
84
|
if zip_filename:
|
97
|
-
# Clean the filename to be safe for file systems
|
98
85
|
safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename)
|
99
86
|
zip_base = tmp_zip_dir / safe_filename
|
100
87
|
else:
|
101
88
|
zip_base = tmp_zip_dir / "doctra_outputs"
|
102
89
|
|
103
|
-
# Create a filtered copy of the output directory excluding temp files
|
104
90
|
filtered_dir = tmp_zip_dir / "filtered_outputs"
|
105
91
|
shutil.copytree(out_dir, filtered_dir, ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'))
|
106
92
|
|
@@ -125,13 +111,10 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
|
|
125
111
|
while i < len(lines):
|
126
112
|
line = lines[i].strip()
|
127
113
|
|
128
|
-
# Check for page header
|
129
114
|
if line.startswith('## Page '):
|
130
|
-
# Save previous page if exists
|
131
115
|
if current_page:
|
132
116
|
pages.append(current_page)
|
133
117
|
|
134
|
-
# Start new page
|
135
118
|
page_num = line.replace('## Page ', '').strip()
|
136
119
|
current_page = {
|
137
120
|
'page_num': page_num,
|
@@ -145,15 +128,12 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
|
|
145
128
|
i += 1
|
146
129
|
continue
|
147
130
|
|
148
|
-
# Check for images (tables, charts, figures)
|
149
131
|
if line.startswith('![') and '](' in line:
|
151
132
|
match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', line)
|
152
133
|
if match:
|
153
134
|
caption = match.group(1)
|
154
135
|
img_path = match.group(2)
|
155
136
|
|
156
|
-
# Categorize by type
|
157
137
|
if 'Table' in caption:
|
158
138
|
current_page['tables'].append({'caption': caption, 'path': img_path})
|
159
139
|
elif 'Chart' in caption:
|
@@ -163,18 +143,15 @@ def _parse_markdown_by_pages(md_content: str) -> List[Dict[str, Any]]:
|
|
163
143
|
|
164
144
|
current_page['images'].append({'caption': caption, 'path': img_path})
|
165
145
|
|
166
|
-
# Add to full content with proper markdown formatting
|
167
146
|
current_page['full_content'].append(f"![{caption}]({img_path})")
|
168
147
|
|
169
|
-
# Regular content
|
170
148
|
elif current_page:
|
171
|
-
if line:
|
149
|
+
if line:
|
172
150
|
current_page['content'].append(line)
|
173
151
|
current_page['full_content'].append(line)
|
174
152
|
|
175
153
|
i += 1
|
176
154
|
|
177
|
-
# Add the last page
|
178
155
|
if current_page:
|
179
156
|
pages.append(current_page)
|
180
157
|
|
@@ -198,12 +175,9 @@ def run_full_parse(
|
|
198
175
|
if not pdf_file:
|
199
176
|
return ("No file provided.", None, [], [], "")
|
200
177
|
|
201
|
-
# Extract filename from the uploaded file path
|
202
|
-
# Gradio provides the original filename in the file path
|
203
178
|
original_filename = Path(pdf_file).stem
|
204
179
|
|
205
180
|
tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
|
206
|
-
# Use original filename for temp file so parser creates correct output directory
|
207
181
|
input_pdf = tmp_dir / f"{original_filename}.pdf"
|
208
182
|
shutil.copy2(pdf_file, input_pdf)
|
209
183
|
|
@@ -295,7 +269,6 @@ def run_extract(
|
|
295
269
|
original_filename = Path(pdf_file).stem
|
296
270
|
|
297
271
|
tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
|
298
|
-
# Use original filename for temp file so parser creates correct output directory
|
299
272
|
input_pdf = tmp_dir / f"{original_filename}.pdf"
|
300
273
|
shutil.copy2(pdf_file, input_pdf)
|
301
274
|
|
doctra/utils/progress.py
CHANGED
@@ -40,7 +40,6 @@ def _detect_environment() -> Tuple[bool, bool, bool]:
|
|
40
40
|
Returns (is_notebook, is_tty, is_windows).
|
41
41
|
"""
|
42
42
|
is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
|
43
|
-
# Colab/Kaggle specifics
|
44
43
|
if "google.colab" in sys.modules:
|
45
44
|
is_notebook = True
|
46
45
|
if "kaggle_secrets" in sys.modules or "kaggle_web_client" in sys.modules:
|
@@ -59,7 +58,6 @@ def _select_emoji(key: str) -> str:
|
|
59
58
|
- ascii: ASCII text tokens
|
60
59
|
- none: empty prefix
|
61
60
|
"""
|
62
|
-
# Maps
|
63
61
|
default_map = {
|
64
62
|
"loading": "🔄",
|
65
63
|
"charts": "📊",
|
@@ -70,14 +68,13 @@ def _select_emoji(key: str) -> str:
|
|
70
68
|
"processing": "⚙️",
|
71
69
|
}
|
72
70
|
safe_map = {
|
73
|
-
# Use BMP or geometric shapes likely to render everywhere
|
74
71
|
"loading": "⏳",
|
75
72
|
"charts": "▦",
|
76
73
|
"tables": "▤",
|
77
74
|
"figures": "▧",
|
78
75
|
"ocr": "🔎",
|
79
76
|
"vlm": "★",
|
80
|
-
"processing": "⚙",
|
77
|
+
"processing": "⚙",
|
81
78
|
}
|
82
79
|
ascii_map = {
|
83
80
|
"loading": "[loading]",
|
@@ -89,13 +86,11 @@ def _select_emoji(key: str) -> str:
|
|
89
86
|
"processing": "[processing]",
|
90
87
|
}
|
91
88
|
|
92
|
-
# Determine effective mode
|
93
89
|
mode = _PROGRESS_CONFIG.emoji_mode
|
94
90
|
is_notebook, _, is_windows = _detect_environment()
|
95
91
|
if not _PROGRESS_CONFIG.use_emoji:
|
96
92
|
mode = "none"
|
97
93
|
elif mode == "default":
|
98
|
-
# Heuristics: prefer safe in Colab/Kaggle notebooks and Windows terminals
|
99
94
|
if is_windows or "google.colab" in sys.modules or "kaggle_secrets" in sys.modules:
|
100
95
|
mode = "safe"
|
101
96
|
|
@@ -105,7 +100,6 @@ def _select_emoji(key: str) -> str:
|
|
105
100
|
return ascii_map.get(key, "")
|
106
101
|
if mode == "safe":
|
107
102
|
return safe_map.get(key, safe_map["processing"])
|
108
|
-
# default
|
109
103
|
return default_map.get(key, default_map["processing"])
|
110
104
|
|
111
105
|
|
@@ -119,17 +113,13 @@ def _supports_unicode_output() -> bool:
|
|
119
113
|
except Exception:
|
120
114
|
pass
|
121
115
|
|
122
|
-
# Heuristics for common notebook environments that support emoji
|
123
116
|
env = os.environ
|
124
117
|
if any(k in env for k in ("COLAB_GPU", "GCE_METADATA_HOST", "KAGGLE_KERNEL_RUN_TYPE", "JPY_PARENT_PID")):
|
125
118
|
return True
|
126
119
|
|
127
|
-
# On modern Windows terminals with UTF-8 code page, assume yes
|
128
120
|
if sys.platform.startswith("win"):
|
129
|
-
# If user opted-in to force ASCII, respect it
|
130
121
|
if _PROGRESS_CONFIG.force_ascii:
|
131
122
|
return False
|
132
|
-
# Try to detect WT/Terminal/VSCode which usually handle Unicode
|
133
123
|
if any(k in env for k in ("WT_SESSION", "TERM_PROGRAM", "VSCODE_PID")):
|
134
124
|
return True
|
135
125
|
|
@@ -161,19 +151,15 @@ def create_beautiful_progress_bar(
|
|
161
151
|
:return: Configured tqdm progress bar instance
|
162
152
|
"""
|
163
153
|
|
164
|
-
# Enhanced styling parameters - notebook-friendly format
|
165
154
|
is_notebook, is_tty, is_windows = _detect_environment()
|
166
155
|
if is_notebook:
|
167
|
-
# Simpler format for notebooks to avoid display issues
|
168
156
|
bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
|
169
157
|
else:
|
170
|
-
# Full format for terminal
|
171
158
|
bar_format = (
|
172
159
|
"{l_bar}{bar:30}| {n_fmt}/{total_fmt} "
|
173
160
|
"[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
|
174
161
|
)
|
175
162
|
|
176
|
-
# Color schemes based on operation type
|
177
163
|
color_schemes = {
|
178
164
|
"loading": {"colour": "cyan", "ncols": 100},
|
179
165
|
"charts": {"colour": "green", "ncols": 100},
|
@@ -184,7 +170,6 @@ def create_beautiful_progress_bar(
|
|
184
170
|
"processing": {"colour": "white", "ncols": 100},
|
185
171
|
}
|
186
172
|
|
187
|
-
# Determine color scheme based on description
|
188
173
|
desc_lower = desc.lower()
|
189
174
|
if "loading" in desc_lower or "model" in desc_lower:
|
190
175
|
color_scheme = color_schemes["loading"]
|
@@ -201,45 +186,37 @@ def create_beautiful_progress_bar(
|
|
201
186
|
else:
|
202
187
|
color_scheme = color_schemes["processing"]
|
203
188
|
|
204
|
-
# Emoji categories
|
205
189
|
emoji_categories = {"loading", "charts", "tables", "figures", "ocr", "vlm", "processing"}
|
206
190
|
|
207
|
-
# Add appropriate emoji to description (can be disabled)
|
208
191
|
if _PROGRESS_CONFIG.use_emoji:
|
209
192
|
prefix_key = next((k for k in emoji_categories if k in desc_lower), "processing")
|
210
193
|
prefix = _select_emoji(prefix_key)
|
211
194
|
if prefix:
|
212
195
|
desc = f"{prefix} {desc}"
|
213
196
|
|
214
|
-
# Enhanced tqdm configuration
|
215
197
|
tqdm_config = {
|
216
198
|
"total": total,
|
217
199
|
"desc": desc,
|
218
200
|
"leave": leave,
|
219
201
|
"bar_format": bar_format,
|
220
202
|
"ncols": _PROGRESS_CONFIG.ncols_env or color_scheme["ncols"],
|
221
|
-
# Prefer Unicode unless user forces ASCII or environment lacks Unicode support
|
222
203
|
"ascii": _PROGRESS_CONFIG.force_ascii or not _supports_unicode_output(),
|
223
|
-
"dynamic_ncols": True,
|
224
|
-
"smoothing": 0.3,
|
225
|
-
"mininterval": 0.1,
|
226
|
-
"maxinterval": 1.0,
|
204
|
+
"dynamic_ncols": True,
|
205
|
+
"smoothing": 0.3,
|
206
|
+
"mininterval": 0.1,
|
207
|
+
"maxinterval": 1.0,
|
227
208
|
"position": position,
|
228
209
|
**kwargs
|
229
210
|
}
|
230
211
|
|
231
|
-
# Enhanced environment detection
|
232
212
|
is_notebook, is_terminal, is_windows = _detect_environment()
|
233
213
|
|
234
|
-
# Add color only for terminal environments (not notebooks)
|
235
214
|
if not is_notebook and is_terminal:
|
236
215
|
tqdm_config["colour"] = color_scheme["colour"]
|
237
216
|
|
238
|
-
# Respect global disable
|
239
217
|
if _PROGRESS_CONFIG.disable:
|
240
218
|
tqdm_config["disable"] = True
|
241
219
|
|
242
|
-
# Try creating the progress bar with Unicode, fallback to ASCII on failure (e.g., Windows code page)
|
243
220
|
if is_notebook:
|
244
221
|
tqdm_config.pop("colour", None)
|
245
222
|
try:
|
@@ -297,7 +274,6 @@ def update_progress_with_info(
|
|
297
274
|
:param info: Optional dictionary of information to display
|
298
275
|
"""
|
299
276
|
if info:
|
300
|
-
# Format info as postfix
|
301
277
|
postfix_parts = []
|
302
278
|
for key, value in info.items():
|
303
279
|
if isinstance(value, float):
|
@@ -354,54 +330,22 @@ def create_notebook_friendly_bar(
|
|
354
330
|
**kwargs
|
355
331
|
) -> tqdm:
|
356
332
|
"""
|
357
|
-
Create a notebook-friendly progress bar with
|
333
|
+
Create a notebook-friendly progress bar with consistent sizing.
|
358
334
|
|
359
|
-
This function creates progress bars
|
360
|
-
|
335
|
+
This function creates progress bars that match the main progress bar
|
336
|
+
styling and behavior in notebook environments.
|
361
337
|
|
362
338
|
:param total: Total number of items to process
|
363
339
|
:param desc: Description text for the progress bar
|
364
340
|
:param kwargs: Additional tqdm parameters
|
365
341
|
:return: Configured notebook-friendly progress bar
|
366
342
|
"""
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
kwargs["disable"] = False
|
372
|
-
# Prefer Unicode in notebooks if supported
|
373
|
-
if "ascii" not in kwargs:
|
374
|
-
kwargs["ascii"] = _PROGRESS_CONFIG.force_ascii or not _supports_unicode_output()
|
375
|
-
|
376
|
-
# Emoji categories
|
377
|
-
emoji_categories = {"loading", "charts", "tables", "figures", "ocr", "vlm", "processing"}
|
378
|
-
|
379
|
-
# Add appropriate emoji to description
|
380
|
-
desc_lower = desc.lower()
|
381
|
-
if _PROGRESS_CONFIG.use_emoji:
|
382
|
-
prefix_key = next((k for k in emoji_categories if k in desc_lower), "processing")
|
383
|
-
prefix = _select_emoji(prefix_key)
|
384
|
-
if prefix:
|
385
|
-
desc = f"{prefix} {desc}"
|
386
|
-
|
387
|
-
# Simple format for notebooks
|
388
|
-
bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt}"
|
389
|
-
|
390
|
-
tqdm_config = {
|
391
|
-
"total": total,
|
392
|
-
"desc": desc,
|
393
|
-
"leave": True,
|
394
|
-
"bar_format": bar_format,
|
395
|
-
"ncols": _PROGRESS_CONFIG.ncols_env or 80,
|
396
|
-
"ascii": kwargs.get("ascii", False),
|
397
|
-
"dynamic_ncols": False, # Fixed width for notebooks
|
398
|
-
"smoothing": 0.1, # Faster updates
|
399
|
-
"mininterval": 0.05,
|
400
|
-
"maxinterval": 0.5,
|
343
|
+
return create_beautiful_progress_bar(
|
344
|
+
total=total,
|
345
|
+
desc=desc,
|
346
|
+
leave=True,
|
401
347
|
**kwargs
|
402
|
-
|
403
|
-
|
404
|
-
return tqdm_auto(**tqdm_config)
|
348
|
+
)
|
405
349
|
|
406
350
|
|
407
351
|
def progress_for(iterable: Iterable[Any], desc: str, total: Optional[int] = None, leave: bool = True, **kwargs) -> Iterator[Any]:
|
doctra/utils/structured_utils.py
CHANGED
@@ -1,49 +1,45 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
from typing import Any, Dict, Optional
|
3
|
-
import json
|
4
|
-
|
5
|
-
try:
|
6
|
-
from pydantic import BaseModel # type: ignore
|
7
|
-
except Exception:
|
8
|
-
class BaseModel:
|
9
|
-
pass
|
10
|
-
|
11
|
-
def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
|
12
|
-
"""
|
13
|
-
Accepts a VLM result that might be:
|
14
|
-
- JSON string
|
15
|
-
- dict
|
16
|
-
- Pydantic BaseModel (v1 .dict() or v2 .model_dump())
|
17
|
-
Returns a normalized dict with keys: title, headers, rows — or None.
|
18
|
-
"""
|
19
|
-
if obj is None:
|
20
|
-
return None
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
return
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
return None
|
47
|
-
return {"title": title, "headers": headers, "rows": rows}
|
48
|
-
|
49
|
-
return None
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Any, Dict, Optional
|
3
|
+
import json
|
4
|
+
|
5
|
+
try:
|
6
|
+
from pydantic import BaseModel # type: ignore
|
7
|
+
except Exception:
|
8
|
+
class BaseModel:
|
9
|
+
pass
|
10
|
+
|
11
|
+
def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
|
12
|
+
"""
|
13
|
+
Accepts a VLM result that might be:
|
14
|
+
- JSON string
|
15
|
+
- dict
|
16
|
+
- Pydantic BaseModel (v1 .dict() or v2 .model_dump())
|
17
|
+
Returns a normalized dict with keys: title, headers, rows — or None.
|
18
|
+
"""
|
19
|
+
if obj is None:
|
20
|
+
return None
|
21
|
+
|
22
|
+
if isinstance(obj, str):
|
23
|
+
try:
|
24
|
+
obj = json.loads(obj)
|
25
|
+
except Exception:
|
26
|
+
return None
|
27
|
+
|
28
|
+
if isinstance(obj, BaseModel):
|
29
|
+
try:
|
30
|
+
return obj.model_dump()
|
31
|
+
except Exception:
|
32
|
+
try:
|
33
|
+
return obj.dict()
|
34
|
+
except Exception:
|
35
|
+
return None
|
36
|
+
|
37
|
+
if isinstance(obj, dict):
|
38
|
+
title = obj.get("title") or "Untitled"
|
39
|
+
headers = obj.get("headers") or []
|
40
|
+
rows = obj.get("rows") or []
|
41
|
+
if not isinstance(headers, list) or not isinstance(rows, list):
|
42
|
+
return None
|
43
|
+
return {"title": title, "headers": headers, "rows": rows}
|
44
|
+
|
45
|
+
return None
|
doctra/version.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
"""Version information for Doctra."""
|
2
|
-
__version__ = '0.3.1'
|
2
|
+
__version__ = '0.3.3'
|
@@ -1,5 +1,5 @@
|
|
1
1
|
doctra/__init__.py,sha256=ST_c2GWBoB0y_wpL1qsOeK4bR1RyJhMMn6I5VjVRI6Y,613
|
2
|
-
doctra/version.py,sha256
|
2
|
+
doctra/version.py,sha256=-8CkxAWlU-OCRJP3Yq9OGjh-4nS4-sU-LRjZ28K6oUw,62
|
3
3
|
doctra/cli/__init__.py,sha256=4PTujjYRShOOUlZ7PwuWckShPWLC4v4CYIhJpzgyv1k,911
|
4
4
|
doctra/cli/main.py,sha256=o_W1b5kx3xaTbWK6l4IYi0YLwffKBj5pQKflnlaG2Fw,35611
|
5
5
|
doctra/cli/utils.py,sha256=IghiUZQCOmXODC5-5smHGz2KeV4xqbP4avmA1Mggln0,11800
|
@@ -14,7 +14,7 @@ doctra/engines/ocr/pytesseract_engine.py,sha256=Imz2uwju6himkBiS8CH7DLxBRe-LtmMY
|
|
14
14
|
doctra/engines/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
doctra/engines/vlm/outlines_types.py,sha256=qL-G6MNiA5mxp1qAPVEFhOANp4NqVt_MQKseJCr_xXE,970
|
16
16
|
doctra/engines/vlm/provider.py,sha256=aE8Eo1U-8XqAimakNlT0-T4etIyCV8rZ3DwxdqbFeTc,3131
|
17
|
-
doctra/engines/vlm/service.py,sha256=
|
17
|
+
doctra/engines/vlm/service.py,sha256=4ExDbLmyyC3ICXxr7OSIqvbOdrwbIJek-DE54vAUgDA,4151
|
18
18
|
doctra/exporters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
doctra/exporters/excel_writer.py,sha256=U5Eb5SF7_ll1QveUapSWSkCRt3OEoisKEVUQ_7X8Wjo,7762
|
20
20
|
doctra/exporters/html_writer.py,sha256=OlW24Eg5bZcjldRHtd3GDD7RrajuRXj43EJpXIJkYf8,38810
|
@@ -23,10 +23,10 @@ doctra/exporters/markdown_table.py,sha256=4_OJIwG_WoIPYBzJx1njy_3tNVdkK6QKSP-P9r
|
|
23
23
|
doctra/exporters/markdown_writer.py,sha256=L7EjF2MB8jYX7XkZ3a3NeeEC8gnb0qzRPTzIN9tdfuw,1027
|
24
24
|
doctra/parsers/__init__.py,sha256=8M6LVzcWGpuTIK_1SMXML3ll7zK1CTHXGI5qXvqdm-A,206
|
25
25
|
doctra/parsers/layout_order.py,sha256=W6b-T11H907RZ2FaZwNvnYhmvH11rpUzxC5yLkdf28k,640
|
26
|
-
doctra/parsers/structured_pdf_parser.py,sha256=
|
27
|
-
doctra/parsers/table_chart_extractor.py,sha256=
|
26
|
+
doctra/parsers/structured_pdf_parser.py,sha256=QIZIS5SAaIdGiT8o7G_a4D-Cht7nVLGeSuVzqSYLn14,19160
|
27
|
+
doctra/parsers/table_chart_extractor.py,sha256=kSubqX0n0kVu_3jzX6QUyKmEGs9sG3Bg9kzUzn2wPHo,13733
|
28
28
|
doctra/ui/__init__.py,sha256=XzOOKeGSBnUREuDQiCIWds1asFSa2nypFQTJXwclROA,85
|
29
|
-
doctra/ui/app.py,sha256=
|
29
|
+
doctra/ui/app.py,sha256=WpXUWHSs7wSYNjY4iBOZJHsKGQ88jDytvOFIjuhqAGE,44031
|
30
30
|
doctra/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
31
|
doctra/utils/bbox.py,sha256=R2-95p0KiWvet3TH27TQVvCar7WJg6z0u3L21iEDF-A,674
|
32
32
|
doctra/utils/constants.py,sha256=ZWOvNDrvETbQ_pxHiX7vUW4J5Oj8_qnov0QacUOBizI,189
|
@@ -34,11 +34,11 @@ doctra/utils/file_ops.py,sha256=3IS0EQncs6Kaj27fcg2zxQX3xRSvtItIsyKGLYgeOgw,815
|
|
34
34
|
doctra/utils/io_utils.py,sha256=L1bWV4-ybs2j_3ZEN7GfQVgdC73JKVECVnpwKbP0dy0,219
|
35
35
|
doctra/utils/ocr_utils.py,sha256=Doa1uYBg3kRgRYd2aPq9fICHgHfrM_efdhZfI7jl6OM,780
|
36
36
|
doctra/utils/pdf_io.py,sha256=c8EY47Z1iqVtlLFHS_n0qGuXJ5ERFaMUd84ivXV0b9E,706
|
37
|
-
doctra/utils/progress.py,sha256=
|
37
|
+
doctra/utils/progress.py,sha256=IKQ_YErWSEd4hddYMUiCORy0_kW4TOYJM891HUEq2_E,11901
|
38
38
|
doctra/utils/quiet.py,sha256=5XPS-1CtJ0sVk6qgSQctdhr_wR8mP1xoJLoUbmkXROA,387
|
39
|
-
doctra/utils/structured_utils.py,sha256=
|
40
|
-
doctra-0.3.
|
41
|
-
doctra-0.3.
|
42
|
-
doctra-0.3.
|
43
|
-
doctra-0.3.
|
44
|
-
doctra-0.3.
|
39
|
+
doctra/utils/structured_utils.py,sha256=znC2zr80rZMfIV58lipZ8M4zPq6IF070pdwLBve1qiE,1251
|
40
|
+
doctra-0.3.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
41
|
+
doctra-0.3.3.dist-info/METADATA,sha256=GX4AvDkmBPFcmt0drF84Wy2WuiqB0ivNw_7bMEpHuMc,28298
|
42
|
+
doctra-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
43
|
+
doctra-0.3.3.dist-info/top_level.txt,sha256=jI7E8jHci2gP9y0GYaWxlg9jG0O5n3FjHJJPLXDXMds,7
|
44
|
+
doctra-0.3.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|